library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✓ ggplot2 3.3.5 ✓ purrr 0.3.4
## ✓ tibble 3.1.6 ✓ stringr 1.4.0
## ✓ tidyr 1.1.4 ✓ forcats 0.5.1
## ✓ readr 2.1.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(knitr)
library(formatR)
library(skimr)
library(kableExtra)
##
## Attaching package: 'kableExtra'
## The following object is masked from 'package:dplyr':
##
## group_rows
library(lm.beta)
library(lsr)
library(ggplot2)
library(plotly)
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
library(plot3D)
library(dotwhisker)
## Registered S3 method overwritten by 'parameters':
## method from
## format.parameters_distribution datawizard
knitr::opts_chunk$set(
echo = TRUE, # render output of code chunks,
warning = TRUE, # do not suppress "warnings"
message = TRUE, # do not suppress "messages"
title="#",
comment = "##", # prefix for comment lines
tidy = TRUE,
tidy.opts = list(blank = FALSE, width.cutoff = 75),
fig.path = "Kim et_al_photos/", # name of folder for images
fig.align = "center" # centers any images on the page
)
# Begin working on descriptive Stats
f <- "https://raw.githubusercontent.com/mrpickett26/Individual-Data-Analysis-Replication/main/tumor%20location%20raw%20data_final.csv"
d <- read_csv(f, col_names = TRUE)
## Rows: 1102 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): Size_Pathol_cm_
## dbl (13): Subtype, Qudrants2_1_UOQ_2_UIQ_3_LOQ_4_LIQ_5_periareolar_6___overl...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
head(d)
## # A tibble: 6 × 15
## ID Size_Pathol_cm_ Subtype Qudrants2_1_UOQ_2_UIQ_3_… MR_Location_ML__1__me…
## <dbl> <chr> <dbl> <dbl> <dbl>
## 1 1 1.50 1 2 2
## 2 2 3.00 1 2 2
## 3 3 1.60 1 1 2
## 4 4 1.80 1 5 2
## 5 5 3.00 1 1 2
## 6 6 3.50 3 1 2
## # … with 10 more variables:
## # MR_Location_AP__1__anterior_2__middle_3__posterior_ <dbl>,
## # Y_distance <dbl>, X_normalized <dbl>, Y_normalized <dbl>,
## # Z_normalized <dbl>, MG_grade <dbl>, X_distance <dbl>, Z_distance <dbl>,
## # Palpability <dbl>, Histologic_full <dbl>
skim(d) #getting some quick stats before filtering... based on raw data a lot of filtering needs to occur but useful for a couple variables
Data summary
|
|
|
|
Name
|
d
|
|
Number of rows
|
1102
|
|
Number of columns
|
15
|
|
_______________________
|
|
|
Column type frequency:
|
|
|
character
|
1
|
|
numeric
|
14
|
|
________________________
|
|
|
Group variables
|
None
|
Variable type: character
|
skim_variable
|
n_missing
|
complete_rate
|
min
|
max
|
empty
|
n_unique
|
whitespace
|
|
Size_Pathol_cm_
|
0
|
1
|
4
|
6
|
0
|
67
|
0
|
Variable type: numeric
|
skim_variable
|
n_missing
|
complete_rate
|
mean
|
sd
|
p0
|
p25
|
p50
|
p75
|
p100
|
hist
|
|
ID
|
0
|
1
|
551.50
|
318.26
|
1.0
|
276.25
|
551.5
|
826.75
|
1102.0
|
▇▇▇▇▇
|
|
Subtype
|
0
|
1
|
1.46
|
0.84
|
1.0
|
1.00
|
1.0
|
1.00
|
3.0
|
▇▁▁▁▂
|
|
Qudrants2_1_UOQ_2_UIQ_3_LOQ_4_LIQ_5_periareolar_6___overlapping
|
0
|
1
|
2.05
|
1.25
|
1.0
|
1.00
|
2.0
|
3.00
|
5.0
|
▇▅▂▁▂
|
|
MR_Location_ML__1__medial_2__central_3__lateral_
|
0
|
1
|
2.20
|
0.77
|
1.0
|
2.00
|
2.0
|
3.00
|
3.0
|
▅▁▇▁▇
|
|
MR_Location_AP__1__anterior_2__middle_3__posterior_
|
0
|
1
|
2.37
|
0.64
|
1.0
|
2.00
|
2.0
|
3.00
|
3.0
|
▂▁▇▁▇
|
|
Y_distance
|
0
|
1
|
2.19
|
1.88
|
0.0
|
0.70
|
1.8
|
3.18
|
10.1
|
▇▃▂▁▁
|
|
X_normalized
|
0
|
1
|
0.05
|
0.18
|
-0.4
|
-0.10
|
0.1
|
0.20
|
0.5
|
▁▅▇▇▁
|
|
Y_normalized
|
0
|
1
|
0.24
|
0.18
|
0.0
|
0.10
|
0.2
|
0.40
|
0.9
|
▇▇▅▁▁
|
|
Z_normalized
|
0
|
1
|
0.06
|
0.13
|
-0.4
|
0.00
|
0.1
|
0.20
|
0.5
|
▁▃▇▃▁
|
|
MG_grade
|
0
|
1
|
2.78
|
0.84
|
1.0
|
2.00
|
3.0
|
3.00
|
4.0
|
▂▃▁▇▃
|
|
X_distance
|
0
|
1
|
0.45
|
1.51
|
-4.2
|
-0.70
|
0.5
|
1.60
|
5.5
|
▁▅▇▅▁
|
|
Z_distance
|
0
|
1
|
0.97
|
2.05
|
-5.5
|
-0.38
|
1.0
|
2.50
|
7.1
|
▁▃▇▅▁
|
|
Palpability
|
0
|
1
|
0.60
|
0.49
|
0.0
|
0.00
|
1.0
|
1.00
|
1.0
|
▅▁▁▁▇
|
|
Histologic_full
|
0
|
1
|
2.30
|
0.67
|
1.0
|
2.00
|
2.0
|
3.00
|
3.0
|
▂▁▇▁▇
|
knitr::opts_chunk$set(fig.path = "Kim_et_al_photos/")
# DESCRIPTIVE STATISTICS
## Want to separate data out into the two different subtypes (ER and
## Triple Negative), key factor in the paper. In this section I focus on
## the Triple Negative subtype. From this filtered dataset, I create
## variables for the number (counts) of each histological grade as well as
## the percentage of each histological grade within the population. I
## wrote a function that included n in it because later on, I will have to
## filter out some of the NAs that are in the raw dataset and this made it
## easier.
Triple_neg <- filter(d, Subtype == "3")
n <- nrow(Triple_neg)
T_Hist_grade_1 <- (length(which(Triple_neg$Histologic_full == "1")))
T_hist_grade_2 <- (length(which(Triple_neg$Histologic_full == "2")))
T_hist_grade_3 <- (length(which(Triple_neg$Histologic_full == "3")))
percent <- function(x) {
perc <- x/256
return(perc)
}
T_hist_grade1_percent <- percent(T_Hist_grade_1)
T_hist_grade2_percent <- percent(T_hist_grade_2)
T_hist_grade3_percent <- percent(T_hist_grade_3)
## T_Hist_grade tells the number of patients whose identified cancers
## belong to each type.
## Here I replace the #NULL! with NA so that I can filter it out into a
## new dataframe. I then convert it into a numeric so that it can be
## passed through the built in mean function.
ans <- Triple_neg %>%
replace(. == "#NULL!", NA)
g <- ans[!is.na(ans$Size_Pathol_cm_), ]
Mean_Size_T <- as.numeric(g$Size_Pathol_cm_)
Mean_size_Trip_neg <- mean(Mean_Size_T)
Mean_size_Trip_neg
## [1] 2.651528
## After finding the mean of the size of the triple negative tumor in cm
## after omitting all #NULL! values, I think the data set in the paper
## maybe replaced the Null values with 0 or used the n of 256 to calculate
## the mean because my mean is slightly higher
Range_size_trip_neg <- range(Mean_Size_T)
Range_size_trip_neg
## [1] 0.2 9.3
# find the range of values for the tumor size. The paper says that the
# range of the values for tumor size are (0.2-5.4), whereas my
# calculations indicate that the range extends up to 9.3.
# Palpability
Triple_neg$Palpability[Triple_neg$Palpability == "1"] <- "No"
Triple_neg$Palpability[Triple_neg$Palpability == "0"] <- "Yes"
## Changed the 1 and 0s found in Palpability to Yes and No values, and now
## count the number of yes and no values
No <- (length(which(Triple_neg$Palpability == "No")))
Yes <- (length(which(Triple_neg$Palpability == "Yes")))
Percent_palp <- percent(Yes)
Percent_not_palp <- percent(No)
## Calculated the number and percentage of palpable vs not palpable tumors
## in the Triple Negative subtype, using the function as defined above. I
## thought it was easiest to assign them into new variable in order to
## format them into the table properly.
## Now I will assign the numeric positional data to new values which will
## make it easier to filter out the data and do further analyses, again I
## thought variable assignment was the best for later calculations.
UOQ <- (length(which(Triple_neg$Qudrants2_1_UOQ_2_UIQ_3_LOQ_4_LIQ_5_periareolar_6___overlapping ==
"1")))
percent_UOQ <- percent(UOQ)
UIQ <- (length(which(Triple_neg$Qudrants2_1_UOQ_2_UIQ_3_LOQ_4_LIQ_5_periareolar_6___overlapping ==
"2")))
percent_UIQ <- percent(UIQ)
LOQ <- (length(which(Triple_neg$Qudrants2_1_UOQ_2_UIQ_3_LOQ_4_LIQ_5_periareolar_6___overlapping ==
"3")))
percent_LOQ <- percent(LOQ)
LIQ <- (length(which(Triple_neg$Qudrants2_1_UOQ_2_UIQ_3_LOQ_4_LIQ_5_periareolar_6___overlapping ==
"4")))
percent_LIQ <- percent(LIQ)
periareolar <- (length(which(Triple_neg$Qudrants2_1_UOQ_2_UIQ_3_LOQ_4_LIQ_5_periareolar_6___overlapping ==
"5")))
percent_periareolar <- percent(periareolar)
## Doing the assignments for mediolateral locations
medial <- (length(which(Triple_neg$MR_Location_ML__1__medial_2__central_3__lateral_ ==
"1")))
percent_medial <- percent(medial)
central <- (length(which(Triple_neg$MR_Location_ML__1__medial_2__central_3__lateral_ ==
"2")))
percent_central <- percent(central)
lateral <- (length(which(Triple_neg$MR_Location_ML__1__medial_2__central_3__lateral_ ==
"3")))
percent_lateral <- percent(lateral)
## Doing the same assignments for anteroposterior locations
anterior <- (length(which(Triple_neg$MR_Location_AP__1__anterior_2__middle_3__posterior_ ==
"1")))
percent_anterior <- percent(anterior)
middle <- (length(which(Triple_neg$MR_Location_AP__1__anterior_2__middle_3__posterior_ ==
"2")))
percent_middle <- percent(middle)
posterior <- (length(which(Triple_neg$MR_Location_AP__1__anterior_2__middle_3__posterior_ ==
"3")))
percent_posterior <- percent(posterior)
## Now I will filter the data for ER and conduct descriptive stats on it
ER_pos <- filter(d, Subtype == "1")
n_E <- nrow(ER_pos)
## Histological Grade filtering, and wrote a new function for percent to
## account for the change in n.
T_Hist_grade_1_E <- (length(which(ER_pos$Histologic_full == "1")))
T_hist_grade_2_E <- (length(which(ER_pos$Histologic_full == "2")))
T_hist_grade_3_E <- (length(which(ER_pos$Histologic_full == "3")))
percentE <- function(x) {
perc <- x/846
return(perc)
}
T_hist_grade1E_percent <- percentE(T_Hist_grade_1_E)
T_hist_grade2E_percent <- percentE(T_hist_grade_2_E)
T_hist_grade3E_percent <- percentE(T_hist_grade_3_E)
## In this, much like in the TN case, I replaced the NULL values with NA
## to remove them from the dataframe to make for easier wrangling.
ans <- ER_pos %>%
replace(. == "#NULL!", NA)
Eg <- ans[!is.na(ans$Size_Pathol_cm_), ]
Mean_Size_E <- as.numeric(Eg$Size_Pathol_cm_)
Mean_size_ER <- mean(Mean_Size_E)
Mean_size_ER
## [1] 2.410102
Range_size_ER <- range(Eg$Size_Pathol_cm_)
Range_size_ER
## [1] "0.20" "9.80"
## Again, the values I got for both the range and mean of the subtype
## differ from what is reported. Could be attributed to replacement with
## zeros or averaging with filtered data. Unsure why the range is
## different than reported.... The filtering was done correctly.
## Now ran the statistics for the palpability of ER data.
No_ER <- (length(which(ER_pos$Palpability == "1")))
Yes_ER <- (length(which(ER_pos$Palpability == "0")))
Percent_palpE <- percentE(Yes_ER)
Percent_not_palpE <- percentE(No_ER)
## Now filter out the ER data based on each quadrant and assign variables
## to each to put into a table. Calculate the percentages as well based on
## the function that I wrote previously.
UOQ_E <- (length(which(ER_pos$Qudrants2_1_UOQ_2_UIQ_3_LOQ_4_LIQ_5_periareolar_6___overlapping ==
"1")))
percent_UOQ_E <- percentE(UOQ_E)
UIQ_E <- (length(which(ER_pos$Qudrants2_1_UOQ_2_UIQ_3_LOQ_4_LIQ_5_periareolar_6___overlapping ==
"2")))
percent_UIQ_E <- percent(UIQ_E)
LOQ_E <- (length(which(ER_pos$Qudrants2_1_UOQ_2_UIQ_3_LOQ_4_LIQ_5_periareolar_6___overlapping ==
"3")))
percent_LOQ_E <- percentE(LOQ_E)
LIQ_E <- (length(which(ER_pos$Qudrants2_1_UOQ_2_UIQ_3_LOQ_4_LIQ_5_periareolar_6___overlapping ==
"4")))
percent_LIQ_E <- percentE(LIQ_E)
periareolar_E <- (length(which(ER_pos$Qudrants2_1_UOQ_2_UIQ_3_LOQ_4_LIQ_5_periareolar_6___overlapping ==
"5")))
percent_periareolar_E <- percentE(periareolar_E)
## Do the same for mediolateral locations. Assign valulations to variables
## to be put in a table.
medial_E <- (length(which(ER_pos$MR_Location_ML__1__medial_2__central_3__lateral_ ==
"1")))
percent_medial_E <- percentE(medial_E)
central_E <- (length(which(ER_pos$MR_Location_ML__1__medial_2__central_3__lateral_ ==
"2")))
percent_central_E <- percentE(central_E)
lateral_E <- (length(which(ER_pos$MR_Location_ML__1__medial_2__central_3__lateral_ ==
"3")))
percent_lateral_E <- percent(lateral_E)
# ER now do the same assignments for anteroposterior locations
anterior_E <- (length(which(ER_pos$MR_Location_AP__1__anterior_2__middle_3__posterior_ ==
"1")))
percent_anterior_E <- percentE(anterior_E)
middle_E <- (length(which(ER_pos$MR_Location_AP__1__anterior_2__middle_3__posterior_ ==
"2")))
percent_middle_E <- percentE(middle_E)
posterior_E <- (length(which(ER_pos$MR_Location_AP__1__anterior_2__middle_3__posterior_ ==
"3")))
percent_posterior_E <- percentE(posterior_E)
# Make variables into a table that matches the one in the paper.
a <- c("Palpability_no", "Palpability_yes", "Upper Outer Quadrant", "Upper Inner Quadrant",
"Lower Outer Quadrant", "Lower Inner Quadrant", "Periareolar", "Mediolateral_Medial",
"Mediolateral_Central", "Mediolateral_Lateral", "Anteroposterior_Anterior",
"Anteroposterior_Middle", "Anteroposterior_Posterior", "Histological Grade 1",
"Histological Grade 2", "Histological Grade 3")
b <- c(Yes, No, UOQ, UIQ, LOQ, LIQ, periareolar, medial, central, lateral, anterior,
middle, posterior, T_Hist_grade_1, T_hist_grade_2, T_hist_grade_3)
c <- 100 * c(Percent_palp, Percent_not_palp, percent_UOQ, percent_UIQ, percent_LOQ,
percent_LIQ, percent_periareolar, percent_medial, percent_central, percent_lateral,
percent_anterior, percent_middle, percent_posterior, T_hist_grade1_percent,
T_hist_grade2_percent, T_hist_grade3_percent)
d <- c(Yes_ER, No_ER, UOQ_E, UIQ_E, LOQ_E, LIQ_E, periareolar_E, medial_E, central_E,
lateral_E, anterior_E, middle_E, posterior_E, T_Hist_grade_1_E, T_hist_grade_2_E,
T_hist_grade_3_E)
e <- 100 * c(Percent_palpE, Percent_not_palpE, percent_UOQ_E, percent_UIQ_E,
percent_LOQ_E, percent_LIQ_E, percent_periareolar_E, percent_medial_E, percent_central_E,
percent_lateral_E, percent_anterior_E, percent_middle_E, percent_posterior_E,
T_hist_grade1E_percent, T_hist_grade2E_percent, T_hist_grade3E_percent)
bind_col <- data.frame(Variables = a, Triple_Negative_Counts = b, Triple_Negative_Percentages = c,
ER_Counts = d, ER_Percentages = e)
kable(bind_col, digits = 3) %>%
kable_styling(font_size = 12, full_width = FALSE)
|
Variables
|
Triple_Negative_Counts
|
Triple_Negative_Percentages
|
ER_Counts
|
ER_Percentages
|
|
Palpability_no
|
60
|
23.438
|
379
|
44.799
|
|
Palpability_yes
|
196
|
76.562
|
467
|
55.201
|
|
Upper Outer Quadrant
|
126
|
49.219
|
375
|
44.326
|
|
Upper Inner Quadrant
|
73
|
28.516
|
225
|
87.891
|
|
Lower Outer Quadrant
|
26
|
10.156
|
112
|
13.239
|
|
Lower Inner Quadrant
|
18
|
7.031
|
61
|
7.210
|
|
Periareolar
|
13
|
5.078
|
73
|
8.629
|
|
Mediolateral_Medial
|
51
|
19.922
|
186
|
21.986
|
|
Mediolateral_Central
|
100
|
39.062
|
310
|
36.643
|
|
Mediolateral_Lateral
|
105
|
41.016
|
350
|
136.719
|
|
Anteroposterior_Anterior
|
17
|
6.641
|
80
|
9.456
|
|
Anteroposterior_Middle
|
99
|
38.672
|
396
|
46.809
|
|
Anteroposterior_Posterior
|
140
|
54.688
|
370
|
43.735
|
|
Histological Grade 1
|
2
|
0.781
|
125
|
14.775
|
|
Histological Grade 2
|
35
|
13.672
|
478
|
56.501
|
|
Histological Grade 3
|
219
|
85.547
|
243
|
28.723
|
bind_col
## Variables Triple_Negative_Counts Triple_Negative_Percentages
## 1 Palpability_no 60 23.437500
## 2 Palpability_yes 196 76.562500
## 3 Upper Outer Quadrant 126 49.218750
## 4 Upper Inner Quadrant 73 28.515625
## 5 Lower Outer Quadrant 26 10.156250
## 6 Lower Inner Quadrant 18 7.031250
## 7 Periareolar 13 5.078125
## 8 Mediolateral_Medial 51 19.921875
## 9 Mediolateral_Central 100 39.062500
## 10 Mediolateral_Lateral 105 41.015625
## 11 Anteroposterior_Anterior 17 6.640625
## 12 Anteroposterior_Middle 99 38.671875
## 13 Anteroposterior_Posterior 140 54.687500
## 14 Histological Grade 1 2 0.781250
## 15 Histological Grade 2 35 13.671875
## 16 Histological Grade 3 219 85.546875
## ER_Counts ER_Percentages
## 1 379 44.799054
## 2 467 55.200946
## 3 375 44.326241
## 4 225 87.890625
## 5 112 13.238771
## 6 61 7.210402
## 7 73 8.628842
## 8 186 21.985816
## 9 310 36.643026
## 10 350 136.718750
## 11 80 9.456265
## 12 396 46.808511
## 13 370 43.735225
## 14 125 14.775414
## 15 478 56.501182
## 16 243 28.723404
knitr::include_graphics("Kim_et_al_photos/count_table.jpg")

knitr::include_graphics("Kim_et_al_photos/location_table.jpg")

# More Descriptive Stats
## Need to filter out the normalized distance from the chest wall and
## distance from the chest wall, need to relplace the NULL with NA in
## order to have a tidy dataset that I can use for future calculations. I
## decided to filter all of the positional data, so that I could use it
## for visualization in the next step.
Triple_neg_normY <- Triple_neg$Y_normalized %>%
replace(. == "#NULL!", NA)
Triple_neg_normY <- na.omit(Triple_neg_normY)
m_Triple_neg_normY <- mean(Triple_neg_normY)
Triple_neg_normX <- Triple_neg$X_normalized %>%
replace(. == "#NULL!", NA)
Triple_neg_normX <- na.omit(Triple_neg_normX)
m_Triple_neg_normX <- mean(Triple_neg_normX)
Triple_neg_normZ <- Triple_neg$Z_normalized %>%
replace(. == "#NULL!", NA)
Triple_neg_normZ <- na.omit(Triple_neg_normZ)
m_Triple_neg_normZ <- mean(Triple_neg_normZ)
Triple_neg_Y <- Triple_neg$Y_distance %>%
replace(. == "#NULL!", NA)
Triple_neg_Y <- na.omit(Triple_neg_Y)
m_Triple_neg_Y <- mean(Triple_neg_Y)
Triple_neg_X <- Triple_neg$X_distance %>%
replace(. == "#NULL!", NA)
Triple_neg_X <- na.omit(Triple_neg_X)
m_Triple_neg_X <- mean(Triple_neg_X)
Triple_neg_Z <- Triple_neg$Z_distance %>%
replace(. == "#NULL!", NA)
Triple_neg_Z <- na.omit(Triple_neg_Z)
m_Triple_neg_Z <- mean(Triple_neg_Z)
## Do the same positional filtering for ER data.
ER_normY <- ER_pos$Y_normalized %>%
replace(. == "#NULL!", NA)
ER_normY <- na.omit(ER_normY)
m_ER_normY <- mean(ER_normY)
ER_normX <- ER_pos$X_normalized %>%
replace(. == "#NULL!", NA)
ER_normX <- na.omit(ER_normX)
m_ER_normX <- mean(ER_normX)
ER_normZ <- ER_pos$Z_normalized %>%
replace(. == "#NULL!", NA)
ER_normZ <- na.omit(ER_normZ)
m_ER_normZ <- mean(ER_normZ)
ER_Y <- ER_pos$Y_distance %>%
replace(. == "#NULL!", NA)
ER_Y <- na.omit(ER_Y)
m_ER_Y <- mean(ER_Y)
ER_X <- ER_pos$X_distance %>%
replace(. == "#NULL!", NA)
ER_X <- na.omit(ER_X)
m_ER_X <- mean(ER_X)
ER_Z <- ER_pos$Z_distance %>%
replace(. == "#NULL!", NA)
ER_Z <- na.omit(ER_Z)
m_ER_Z <- mean(ER_Z)
## Begin doing inferential stats, assume normal distribution for standard
## error and confidence intervals. I created a function for both of them
## and will use it to define variables in the future.
SE <- function(x, type = "normal") {
if (type == "normal") {
SE <- sd(x)/sqrt(length(x))
}
if (type == "poisson") {
SE <- sqrt(mean(x)/length(x))
# mean(x) is estimate of lambda
}
return(SE)
}
CI_norm <- function(x, alpha = 0.05) {
CI <- mean(x) + c(-1, 1) * qnorm(1 - alpha/2) * SE(x)
# confidence interval based on normal distribution
names(CI) <- c("CI norm L", "CI norm U")
return(CI)
}
## Now take the data and filter it by positional filters and pass through
## the functions needed to get the variables of interest for the summary
## table that will later be created.
ER_means <- ER_pos %>%
dplyr::summarise(across(.cols = c(Y_distance, X_distance, Z_distance, Y_normalized,
X_normalized, Z_normalized), .fns = ~mean(.)))
ER_SD <- ER_pos %>%
dplyr::summarise(across(.cols = c(Y_distance, X_distance, Z_distance, Y_normalized,
X_normalized, Z_normalized), .fns = ~sd(.)))
ER_SEs <- ER_pos %>%
dplyr::summarise(across(.cols = c(Y_distance, X_distance, Z_distance, Y_normalized,
X_normalized, Z_normalized), .fns = ~SE(., type = "normal")))
ER_CI_norm <- ER_pos %>%
dplyr::summarise(across(.cols = c(Y_distance, X_distance, Z_distance, Y_normalized,
X_normalized, Z_normalized), .fns = ~CI_norm(.)))
TN_means <- Triple_neg %>%
dplyr::summarise(across(.cols = c(Y_distance, X_distance, Z_distance, Y_normalized,
X_normalized, Z_normalized), .fns = ~mean(.)))
TN_SD <- Triple_neg %>%
dplyr::summarise(across(.cols = c(Y_distance, X_distance, Z_distance, Y_normalized,
X_normalized, Z_normalized), .fns = ~sd(.)))
TN_SEs <- Triple_neg %>%
dplyr::summarise(across(.cols = c(Y_distance, X_distance, Z_distance, Y_normalized,
X_normalized, Z_normalized), .fns = ~SE(., type = "normal")))
TN_CI_norm <- Triple_neg %>%
dplyr::summarise(across(.cols = c(Y_distance, X_distance, Z_distance, Y_normalized,
X_normalized, Z_normalized), .fns = ~CI_norm(.)))
## Not perform t-tests on on the variables/filtered data in order to
## extract the p-values needed for the tables.
X_norm <- t.test(ER_normX, Triple_neg_normX)
p_x_norm <- X_norm$p.value
Y_norm <- t.test(ER_normY, Triple_neg_normY)
p_y_norm <- Y_norm$p.value
Z_norm <- t.test(ER_normZ, Triple_neg_normZ)
p_Z_norm <- Z_norm$p.value
Z_dist <- t.test(ER_Z, Triple_neg_Z)
p_Z <- Z_dist$p.value
x_dist <- t.test(ER_X, Triple_neg_X)
p_x <- x_dist$p.value
y_dist <- t.test(ER_Y, Triple_neg_Y)
p_y <- y_dist$p.value
## Create the second descriptive table found in the paper
samp_1_summary <- as_tibble(t(bind_rows(ER_means, TN_means, ER_SD, TN_SD, ER_SEs,
TN_SEs, ER_CI_norm, TN_CI_norm)), .name_repair = "minimal")
names(samp_1_summary) <- c(" ER Mean", "TN Mean", "ER SD", "TN SD", "ER Std Error",
"TN Std Error", "ER Confidence Interval L", "ER Confidence Interval U",
"TN Confidence Interval L", "TN Confidence Interval U")
variables <- tibble(Variable = c("Y_distance", "X_distance", "Z_distance", "Y_normalized",
"X_normalized", "Z_normalized"))
samp_summary <- bind_cols(variables, samp_1_summary)
kable(samp_1_summary, digits = 3) %>%
kable_styling(font_size = 12, full_width = FALSE)
|
ER Mean
|
TN Mean
|
ER SD
|
TN SD
|
ER Std Error
|
TN Std Error
|
ER Confidence Interval L
|
ER Confidence Interval U
|
TN Confidence Interval L
|
TN Confidence Interval U
|
|
2.300
|
1.841
|
1.932
|
1.645
|
0.066
|
0.103
|
2.169
|
2.430
|
1.640
|
2.043
|
|
0.430
|
0.498
|
1.516
|
1.491
|
0.052
|
0.093
|
0.328
|
0.533
|
0.316
|
0.681
|
|
0.912
|
1.179
|
2.021
|
2.140
|
0.069
|
0.134
|
0.776
|
1.048
|
0.917
|
1.441
|
|
0.253
|
0.209
|
0.183
|
0.170
|
0.006
|
0.011
|
0.241
|
0.265
|
0.188
|
0.229
|
|
0.054
|
0.057
|
0.176
|
0.172
|
0.006
|
0.011
|
0.042
|
0.066
|
0.036
|
0.079
|
|
0.059
|
0.075
|
0.130
|
0.139
|
0.004
|
0.009
|
0.050
|
0.067
|
0.058
|
0.092
|
samp_summary
## # A tibble: 6 × 11
## Variable ` ER Mean` `TN Mean` `ER SD` `TN SD` `ER Std Error` `TN Std Error`
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Y_distance 2.30 1.84 1.93 1.65 0.0664 0.103
## 2 X_distance 0.430 0.498 1.52 1.49 0.0521 0.0932
## 3 Z_distance 0.912 1.18 2.02 2.14 0.0695 0.134
## 4 Y_normaliz… 0.253 0.209 0.183 0.170 0.00630 0.0106
## 5 X_normaliz… 0.0538 0.0574 0.176 0.172 0.00605 0.0108
## 6 Z_normaliz… 0.0585 0.0746 0.130 0.139 0.00446 0.00867
## # … with 4 more variables: ER Confidence Interval L <dbl>,
## # ER Confidence Interval U <dbl>, TN Confidence Interval L <dbl>,
## # TN Confidence Interval U <dbl>
knitr::include_graphics("Kim_et_al_photos/distance_table.jpg")

# Inferential Stats Do inferential stats for the paper, no distinction
# between TN and ER for calculations for the beta coefficients Reading in
# the data again because so many variables were defined before that it is
# just simpler for my brain to refilter based off of what is needed from
# the total dataset
f <- "https://raw.githubusercontent.com/mrpickett26/Individual-Data-Analysis-Replication/main/tumor%20location%20raw%20data_final.csv"
d <- read_csv(f, col_names = TRUE)
## Rows: 1102 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): Size_Pathol_cm_
## dbl (13): Subtype, Qudrants2_1_UOQ_2_UIQ_3_LOQ_4_LIQ_5_periareolar_6___overl...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
ans2 <- d %>%
replace(. == "#NULL!", NA)
ans2 <- na.omit(ans2)
## Redefine variables from the filtered dataset and convert them to
## numeric so that I can do further calculations on them more easily
Size_pathol <- as.numeric(ans2$Size_Pathol_cm_)
Hist_grade <- as.numeric(ans2$Histologic_full)
Subtype <- as.numeric(ans2$Subtype)
## Define the absolute and normalized y distances as numeric variable for
## use in later calculations
normY_d <- ans2$Y_normalized ## normalized
Y_d <- ans2$Y_distance ##Absolute
## Hand calculate the beta coefficients of normalized y-axis and tumor
## size before doing any of the built in calculations.
beta1_size <- cor(normY_d, Size_pathol) * (sd(normY_d)/sd(Size_pathol))
beta1_size
## [1] -0.008155774
(beta0_size <- mean(normY_d) - beta1_size * mean(Size_pathol))
## [1] 0.2633268
beta0_size #do hand calculations of beta0 for normalized y distance am size of tumor
## [1] 0.2633268
## Now do standard error of b1 and b0 for normalized y distance am size of
## tumor by hand
n <- nrow(ans2)
df <- n - 2
mean_x <- mean(Size_pathol)
y_pred = beta0_size + beta1_size * normY_d
y_error = normY_d - y_pred
std_err_b1_dist <- sqrt((sum(y_error^2))/((n - 2) * sum((Size_pathol - mean_x)^2)))
std_err_b1_dist
## [1] 0.004522476
## I will use the lm function to evaluate the rest of the variables BUT if
## I was to do it by hand, it would follow the formats about for
## beta1_size, beta1_size_norm, beta0_size, and beta0_size_norm
## Now utilize the lm function to double check calculations and calculate
## the beta coefficients of absolute/normalized against selected variables
## (subtype, size, histological grade). I chose subtype because it is
## foundational to the paper, size becuase the data is continuous, and
## histological grade because it was good for me to practice categorical
## data analysis and it was in a relatively clean format.
m_size_path_norm <- lm(normY_d ~ Size_pathol, data = ans2)
summary(m_size_path_norm)
##
## Call:
## lm(formula = normY_d ~ Size_pathol, data = ans2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.26170 -0.14620 -0.04294 0.14483 0.64646
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.263327 0.012372 21.29 <2e-16 ***
## Size_pathol -0.008156 0.004457 -1.83 0.0675 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.181 on 1009 degrees of freedom
## Multiple R-squared: 0.003308, Adjusted R-squared: 0.00232
## F-statistic: 3.349 on 1 and 1009 DF, p-value: 0.06754
std_err_pn <- coef(summary(m_size_path_norm))[, "Std. Error"]
m_Hist_grade_norm <- lm(normY_d ~ Hist_grade, data = ans2)
summary(m_Hist_grade_norm) #This one seems to differ
##
## Call:
## lm(formula = normY_d ~ Hist_grade, data = ans2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.27989 -0.15185 -0.02381 0.12011 0.67619
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.307928 0.020476 15.039 < 2e-16 ***
## Hist_grade -0.028039 0.008526 -3.289 0.00104 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1803 on 1009 degrees of freedom
## Multiple R-squared: 0.01061, Adjusted R-squared: 0.009625
## F-statistic: 10.82 on 1 and 1009 DF, p-value: 0.001041
std_err_hgn <- coef(summary(m_Hist_grade_norm))[, "Std. Error"]
m_Subtype_norm <- lm(normY_d ~ Subtype, data = ans2)
summary(m_Subtype_norm)
##
## Call:
## lm(formula = normY_d ~ Subtype, data = ans2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.25448 -0.15448 -0.05448 0.14552 0.69520
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.279312 0.011346 24.619 < 2e-16 ***
## Subtype -0.024836 0.006766 -3.671 0.000254 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1801 on 1009 degrees of freedom
## Multiple R-squared: 0.01318, Adjusted R-squared: 0.0122
## F-statistic: 13.48 on 1 and 1009 DF, p-value: 0.0002544
std_err_sn <- coef(summary(m_Subtype_norm))[, "Std. Error"]
m_size_path <- lm(Y_d ~ Size_pathol, data = ans2)
summary(m_size_path)
##
## Call:
## lm(formula = Y_d ~ Size_pathol, data = ans2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.3013 -1.4678 -0.3680 0.9822 7.7988
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.36809 0.12893 18.368 <2e-16 ***
## Size_pathol -0.06675 0.04644 -1.437 0.151
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.886 on 1009 degrees of freedom
## Multiple R-squared: 0.002043, Adjusted R-squared: 0.001054
## F-statistic: 2.066 on 1 and 1009 DF, p-value: 0.1509
std_err_sp <- coef(summary(m_size_path))[, "Std. Error"]
m_Hist_grade <- lm(Y_d ~ Hist_grade, data = ans2)
summary(m_Hist_grade) #This one seems to differ
##
## Call:
## lm(formula = Y_d ~ Hist_grade, data = ans2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.4707 -1.4899 -0.4091 0.9909 7.9909
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.85152 0.21333 13.367 < 2e-16 ***
## Hist_grade -0.28079 0.08883 -3.161 0.00162 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.879 on 1009 degrees of freedom
## Multiple R-squared: 0.009806, Adjusted R-squared: 0.008825
## F-statistic: 9.993 on 1 and 1009 DF, p-value: 0.001619
std_err_hg <- coef(summary(m_Hist_grade))[, "Std. Error"]
m_Subtype <- lm(Y_d ~ Subtype, data = ans2)
summary(m_Subtype)
##
## Call:
## lm(formula = Y_d ~ Subtype, data = ans2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.3162 -1.5162 -0.4162 0.9838 7.7838
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.5650 0.1182 21.697 < 2e-16 ***
## Subtype -0.2487 0.0705 -3.528 0.000437 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.876 on 1009 degrees of freedom
## Multiple R-squared: 0.01219, Adjusted R-squared: 0.01121
## F-statistic: 12.45 on 1 and 1009 DF, p-value: 0.000437
std_err_s <- coef(summary(m_Subtype))[, "Std. Error"]
#'In statistics, standardized (regression) coefficients, also called beta coefficients or beta weights, are the estimates resulting from a regression analysis where the underlying data have been standardized so that the variances of dependent and independent variables are equal to 1.' source: https://en.wikipedia.org/wiki/Standardized_coefficient, the lm.beta package allows us to calculate this easily, and will be done for the previously used variables.
Std_beta_coeff_Y_norm_size <- lm.beta(m_size_path_norm)
summary(Std_beta_coeff_Y_norm_size)
##
## Call:
## lm(formula = normY_d ~ Size_pathol, data = ans2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.26170 -0.14620 -0.04294 0.14483 0.64646
##
## Coefficients:
## Estimate Standardized Std. Error t value Pr(>|t|)
## (Intercept) 0.263327 0.000000 0.012372 21.29 <2e-16 ***
## Size_pathol -0.008156 -0.057517 0.004457 -1.83 0.0675 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.181 on 1009 degrees of freedom
## Multiple R-squared: 0.003308, Adjusted R-squared: 0.00232
## F-statistic: 3.349 on 1 and 1009 DF, p-value: 0.06754
coeff_size_norm <- Std_beta_coeff_Y_norm_size$coefficients
std_coeff_size_norm <- Std_beta_coeff_Y_norm_size$standardized.coefficients
Std_beta_coeff_Y_size <- lm.beta(m_size_path)
summary(Std_beta_coeff_Y_size)
##
## Call:
## lm(formula = Y_d ~ Size_pathol, data = ans2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.3013 -1.4678 -0.3680 0.9822 7.7988
##
## Coefficients:
## Estimate Standardized Std. Error t value Pr(>|t|)
## (Intercept) 2.36809 0.00000 0.12893 18.368 <2e-16 ***
## Size_pathol -0.06675 -0.04520 0.04644 -1.437 0.151
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.886 on 1009 degrees of freedom
## Multiple R-squared: 0.002043, Adjusted R-squared: 0.001054
## F-statistic: 2.066 on 1 and 1009 DF, p-value: 0.1509
coeff_size <- Std_beta_coeff_Y_size$coefficients
std_coeff_size <- Std_beta_coeff_Y_size$standardized.coefficients
std_beta_hist_norm <- lm.beta(m_Hist_grade_norm)
summary(std_beta_hist_norm)
##
## Call:
## lm(formula = normY_d ~ Hist_grade, data = ans2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.27989 -0.15185 -0.02381 0.12011 0.67619
##
## Coefficients:
## Estimate Standardized Std. Error t value Pr(>|t|)
## (Intercept) 0.307928 0.000000 0.020476 15.039 < 2e-16 ***
## Hist_grade -0.028039 -0.102983 0.008526 -3.289 0.00104 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1803 on 1009 degrees of freedom
## Multiple R-squared: 0.01061, Adjusted R-squared: 0.009625
## F-statistic: 10.82 on 1 and 1009 DF, p-value: 0.001041
coeff_hist_norm <- std_beta_hist_norm$coefficients
std_coeff_hist_norm <- std_beta_hist_norm$standardized.coefficients
std_beta_hist <- lm.beta(m_Hist_grade)
summary(m_Hist_grade)
##
## Call:
## lm(formula = Y_d ~ Hist_grade, data = ans2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.4707 -1.4899 -0.4091 0.9909 7.9909
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.85152 0.21333 13.367 < 2e-16 ***
## Hist_grade -0.28079 0.08883 -3.161 0.00162 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.879 on 1009 degrees of freedom
## Multiple R-squared: 0.009806, Adjusted R-squared: 0.008825
## F-statistic: 9.993 on 1 and 1009 DF, p-value: 0.001619
coeff_hist <- std_beta_hist$coefficients
std_coeff_hist <- std_beta_hist$standardized.coefficients
std_beta_sub_norm <- lm.beta(m_Subtype_norm)
summary(std_beta_sub_norm)
##
## Call:
## lm(formula = normY_d ~ Subtype, data = ans2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.25448 -0.15448 -0.05448 0.14552 0.69520
##
## Coefficients:
## Estimate Standardized Std. Error t value Pr(>|t|)
## (Intercept) 0.279312 0.000000 0.011346 24.619 < 2e-16 ***
## Subtype -0.024836 -0.114801 0.006766 -3.671 0.000254 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1801 on 1009 degrees of freedom
## Multiple R-squared: 0.01318, Adjusted R-squared: 0.0122
## F-statistic: 13.48 on 1 and 1009 DF, p-value: 0.0002544
coeff_norm_sub <- std_beta_sub_norm$coefficients
std_coeff_norm_sub <- std_beta_sub_norm$standardized.coefficients
std_beta_sub <- lm.beta(m_Subtype)
summary(std_beta_sub)
##
## Call:
## lm(formula = Y_d ~ Subtype, data = ans2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.3162 -1.5162 -0.4162 0.9838 7.7838
##
## Coefficients:
## Estimate Standardized Std. Error t value Pr(>|t|)
## (Intercept) 2.5650 0.0000 0.1182 21.697 < 2e-16 ***
## Subtype -0.2487 -0.1104 0.0705 -3.528 0.000437 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.876 on 1009 degrees of freedom
## Multiple R-squared: 0.01219, Adjusted R-squared: 0.01121
## F-statistic: 12.45 on 1 and 1009 DF, p-value: 0.000437
coeff_sub <- std_beta_sub$coefficients
std_coeff_sub <- std_beta_sub$standardized.coefficients
## Now creating the table that is seen in the paper for inferential stats
## (beta, standard err, standardized beta). Pulling all of the 'outputs'
## out of the summary table and creating a tibble to bind together into
## the larger table.
samp_2_summary <- as_tibble(t(bind_rows(coeff_hist, std_err_hg, std_coeff_hist)),
.name_repair = "minimal")
samp_2_summary1 <- as_tibble(t(bind_rows(coeff_hist_norm, std_err_hgn, std_coeff_hist_norm)),
.name_repair = "minimal")
samp_3_summary <- as_tibble(t(bind_rows(coeff_size, std_err_sp, std_coeff_size)),
.name_repair = "minimal")
samp_3_summary1 <- as_tibble(t(bind_rows(coeff_size_norm, std_err_pn, std_coeff_size_norm)),
.name_repair = "minimal")
samp_4_summary <- as_tibble(t(bind_rows(coeff_sub, std_err_s, std_coeff_sub)),
.name_repair = "minimal")
samp_4_summary1 <- as_tibble(t(bind_rows(coeff_norm_sub, std_err_sn, std_coeff_norm_sub)),
.name_repair = "minimal")
samp_2_summary <- samp_2_summary[-c(1), ]
samp_3_summary <- samp_3_summary[-c(1), ]
samp_4_summary <- samp_4_summary[-c(1), ]
samp_2_summary1 <- samp_2_summary1[-c(1), ]
samp_3_summary1 <- samp_3_summary1[-c(1), ]
samp_4_summary1 <- samp_4_summary1[-c(1), ]
Y_dist_data <- bind_rows(samp_2_summary, samp_3_summary, samp_4_summary, samp_2_summary1,
samp_3_summary1, samp_4_summary1)
## New names:
## * `` -> ...1
## * `` -> ...2
## * `` -> ...3
## New names:
## * `` -> ...1
## * `` -> ...2
## * `` -> ...3
## New names:
## * `` -> ...1
## * `` -> ...2
## * `` -> ...3
## New names:
## * `` -> ...1
## * `` -> ...2
## * `` -> ...3
## New names:
## * `` -> ...1
## * `` -> ...2
## * `` -> ...3
## New names:
## * `` -> ...1
## * `` -> ...2
## * `` -> ...3
names(Y_dist_data) <- c("Beta Coefficient", "Standard Error", "Standardized Beta Coefficient")
variables <- tibble(Variable = c("Absolute y-axis Histological Grade", "Normalized y-axis Histological Grade",
"Absolute y-axis Tumor Size", "Normalized y-axis Tumor Size", "Absolute y-axis Triple Negative",
" Normalized y-axis Triple Negative"))
Y_dist_data <- bind_cols(variables, Y_dist_data)
kable(Y_dist_data, digits = 3) %>%
kable_styling(font_size = 12, full_width = FALSE)
|
Variable
|
Beta Coefficient
|
Standard Error
|
Standardized Beta Coefficient
|
|
Absolute y-axis Histological Grade
|
-0.281
|
0.089
|
-0.099
|
|
Normalized y-axis Histological Grade
|
-0.067
|
0.046
|
-0.045
|
|
Absolute y-axis Tumor Size
|
-0.249
|
0.070
|
-0.110
|
|
Normalized y-axis Tumor Size
|
-0.028
|
0.009
|
-0.103
|
|
Absolute y-axis Triple Negative
|
-0.008
|
0.004
|
-0.058
|
|
Normalized y-axis Triple Negative
|
-0.025
|
0.007
|
-0.115
|
Y_dist_data
## # A tibble: 6 × 4
## Variable `Beta Coefficie… `Standard Error` `Standardized Beta Co…
## <chr> <dbl> <dbl> <dbl>
## 1 "Absolute y-axis His… -0.281 0.0888 -0.0990
## 2 "Normalized y-axis H… -0.0668 0.0464 -0.0452
## 3 "Absolute y-axis Tum… -0.249 0.0705 -0.110
## 4 "Normalized y-axis T… -0.0280 0.00853 -0.103
## 5 "Absolute y-axis Tri… -0.00816 0.00446 -0.0575
## 6 " Normalized y-axis … -0.0248 0.00677 -0.115
knitr::include_graphics("Kim_et_al_photos/inferential_table.jpg")

# DATA VISUALIZATION Do data visualization now. No real visualization in
# the paper, so I chose a 3D plot of the positional data of the different
# cancer subtypes for both normalized and absolute. This allows me to
# visualize the distribution of where each tumor is located and how the
# normalization shifts the positions of the data.
Norm_ER__plot <- scatter3D(ER_normX, ER_normY, ER_normZ, clab = c("ER Normalized",
"Position (cm)"))

Abs_ER__plot <- scatter3D(ER_X, ER_Y, ER_Z, clab = c(" ER Absolute", "Position (cm)"))

Norm_TN__plot <- scatter3D(Triple_neg_normX, Triple_neg_normY, Triple_neg_normZ,
clab = c("TN Normalized", "Position (cm)"))

Abs_TN__plot <- scatter3D(Triple_neg_X, Triple_neg_Y, Triple_neg_Z, clab = c("TN Absolute",
"Position (cm)"))

## When the positional data is normalized, it looks very 'normal' (go
## figure!), there is no skewness to it, which makes sense since it all is
## normalized within the spread of the data, so the center should be at
## point zero.
## Because the data is categorical, I decided that it didn't make much
## sense to plot all of them as scatter plots, but that a dot and whisker
## plot might be more intuitive. The dots represent the regression
## coefficients of the two separate models and the whiskers represent the
## 99% confidence intervals (as specified in the code)
dwplot(list(m_Hist_grade, m_Hist_grade_norm, m_Subtype, m_Subtype_norm, m_size_path,
m_size_path_norm), ci = 0.99, show_intercept = FALSE) + scale_y_discrete(labels = c(Subtype = "",
Hist_grade = "Histological Grade", Size_pathol = "Subtype")) + scale_color_hue(labels = c("Normalized y-axis position",
"Absolute y-axis position", "Normalized y-axis position", "Absolute y-axis position",
"Normalized y-axis position", "Absolute y-axis position")) + theme(plot.title = element_text(face = "bold")) +
ggtitle("Regression Coefficients for Breast Tumor Y-axis positions") + xlab("Standardized Beta Coefficient")

## I don't know why the labels on the y axis have to be switched to be
## correct... I went back and checked and the models earlier and they are
## labelled correctly and everything. Not super familiar with dotwhisker.
# Normalized y-axis and Tumor Size
plot(m_size_path_norm)




# Absolute y-axis and Tumor Size
plot(m_size_path)



